vote_lm = lm(trump.vote~metro+non.citizen+unemployed, data=election2016)
summary(vote_lm)
##
## Call:
## lm(formula = trump.vote ~ metro + non.citizen + unemployed, data = election2016)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.27302 -0.04527 -0.00485 0.06165 0.14827
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.62064 0.06793 9.136 6.65e-12 ***
## metro -0.18211 0.10118 -1.800 0.0784 .
## non.citizen -1.25402 0.57049 -2.198 0.0330 *
## unemployed 1.58516 1.15351 1.374 0.1760
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08011 on 46 degrees of freedom
## Multiple R-squared: 0.4072, Adjusted R-squared: 0.3686
## F-statistic: 10.53 on 3 and 46 DF, p-value: 2.157e-05
Seems like non.citizen feature is the only one that is significant based on 0.033 p-value.
reduced_vote_lm = lm(trump.vote~non.citizen, data=election2016)
summary(reduced_vote_lm)
##
## Call:
## lm(formula = trump.vote ~ non.citizen, data = election2016)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.248506 -0.051852 -0.003506 0.067084 0.145411
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.5979 0.0225 26.569 < 2e-16 ***
## non.citizen -1.9392 0.3783 -5.126 5.25e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08189 on 48 degrees of freedom
## Multiple R-squared: 0.3537, Adjusted R-squared: 0.3403
## F-statistic: 26.27 on 1 and 48 DF, p-value: 5.249e-06
anova(reduced_vote_lm, vote_lm)
## Analysis of Variance Table
##
## Model 1: trump.vote ~ non.citizen
## Model 2: trump.vote ~ metro + non.citizen + unemployed
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 48 0.32188
## 2 46 0.29522 2 0.026657 2.0768 0.1369
Since the p value is greater than 0.05, we can conclude that adding other predicters does not improve our model
all_vote_lm = lm(trump.vote~. -state, data=election2016)
summary(all_vote_lm)
##
## Call:
## lm(formula = trump.vote ~ . - state, data = election2016)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.154253 -0.028060 0.000814 0.031951 0.091114
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.837e+00 6.129e-01 2.998 0.00466 **
## median.income -1.003e-06 1.882e-06 -0.533 0.59697
## unemployed 2.430e-01 9.441e-01 0.257 0.79821
## metro -1.660e-01 6.928e-02 -2.397 0.02132 *
## high.school -1.049e+00 4.620e-01 -2.271 0.02863 *
## non.citizen 6.664e-02 4.391e-01 0.152 0.88013
## white.pov -7.551e-01 5.425e-01 -1.392 0.17163
## income.ineq -4.555e-01 6.465e-01 -0.705 0.48520
## non.white -1.916e-01 6.987e-02 -2.741 0.00910 **
## outcomeRed 1.201e-01 2.049e-02 5.860 7.44e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04835 on 40 degrees of freedom
## Multiple R-squared: 0.8122, Adjusted R-squared: 0.77
## F-statistic: 19.22 on 9 and 40 DF, p-value: 6.64e-12
anova(reduced_vote_lm, all_vote_lm)
## Analysis of Variance Table
##
## Model 1: trump.vote ~ non.citizen
## Model 2: trump.vote ~ (state + median.income + unemployed + metro + high.school +
## non.citizen + white.pov + income.ineq + non.white + outcome) -
## state
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 48 0.32188
## 2 40 0.09352 8 0.22835 12.208 1.234e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova test also confirms that the full model is better than the reduced model since we have a very low p-value. In this model high.school and outcomeRed were significant predictors. And we can also observe that R2 value almost doubled compared to previous SLM. This means that the final model performs much better.
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
election_3d = plot_ly(election2016, x=~high.school, y=~outcome, z=~trump.vote)
election_3d %>% add_markers
Seems like with the blue outcome, the high.school numbers are majorly higher than red outcome.
gapminder <- read.csv("~/Documents/math133/datasets/gapminder.csv")
filtered_gapminder <- gapminder %>%
filter(year == 2007)
head(filtered_gapminder)
## country year pop continent gdpPercap lifeExp
## 1 Afghanistan 2007 31889923 Asia 974.5803 43.828
## 2 Albania 2007 3600523 Europe 5937.0295 76.423
## 3 Algeria 2007 33333216 Africa 6223.3675 72.301
## 4 Angola 2007 12420476 Africa 4797.2313 42.731
## 5 Argentina 2007 40301927 Americas 12779.3796 75.320
## 6 Australia 2007 20434176 Oceania 34435.3674 81.235
library(ggplot2)
ggplot(filtered_gapminder, aes(x = continent, y = lifeExp)) +
geom_bar(stat = "identity") +
labs(title = "Bar Chart", x = "continent", y = "lifeExp") +
theme_minimal()
oceania_out <- filtered_gapminder %>%
filter(continent != "Oceania")
ggplot(oceania_out, aes(x = continent, y = lifeExp)) +
geom_bar(stat = "identity") +
labs(title = "Bar Chart", x = "continent", y = "lifeExp") +
theme_minimal()
### b)
gapminder_lm = lm(lifeExp~pop+gdpPercap+continent, data=oceania_out)
summary(gapminder_lm)
##
## Call:
## lm(formula = lifeExp ~ pop + gdpPercap + continent, data = oceania_out)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.8100 -3.0434 0.1563 2.9280 20.0551
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.371e+01 9.389e-01 57.205 < 2e-16 ***
## pop 9.697e-10 3.940e-09 0.246 0.806
## gdpPercap 3.487e-04 5.747e-05 6.068 1.25e-08 ***
## continentAmericas 1.603e+01 1.677e+00 9.553 < 2e-16 ***
## continentAsia 1.256e+01 1.627e+00 7.717 2.44e-12 ***
## continentEurope 1.518e+01 1.974e+00 7.689 2.84e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.62 on 134 degrees of freedom
## Multiple R-squared: 0.7088, Adjusted R-squared: 0.698
## F-statistic: 65.24 on 5 and 134 DF, p-value: < 2.2e-16
We can say that gdpPercap is an important mf :) bc the p-value is low. The p-value of pop is very big, so we can say that it can fuck off. Its coefficient is anyways very small. The baseline category for continent is Africa. The features continentAmericas, continentAsia, continentEurope has very low p-values with 1.603e+01, 1.256e+01, 1.518e+01 coefficients respectively. This coefficients indicate how model performs compared to continentAfrica which is the baseline category.
oceania_out %>% ggplot(aes(x=gdpPercap,y=continent))+geom_point(shape=1)
we can say that in the order Africa, Americas, Asia and Europe, GDP of
values grow.
gapminder_reduced_lm = lm(lifeExp~gdpPercap+continent, data=oceania_out)
summary(gapminder_reduced_lm)
##
## Call:
## lm(formula = lifeExp ~ gdpPercap + continent, data = oceania_out)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.9058 -2.8987 0.1394 2.9136 20.0445
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.373e+01 9.317e-01 57.674 < 2e-16 ***
## gdpPercap 3.474e-04 5.703e-05 6.092 1.09e-08 ***
## continentAmericas 1.605e+01 1.668e+00 9.625 < 2e-16 ***
## continentAsia 1.266e+01 1.563e+00 8.103 2.85e-13 ***
## continentEurope 1.521e+01 1.964e+00 7.746 2.02e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.597 on 135 degrees of freedom
## Multiple R-squared: 0.7087, Adjusted R-squared: 0.7001
## F-statistic: 82.11 on 4 and 135 DF, p-value: < 2.2e-16
anova(gapminder_reduced_lm, gapminder_lm)
## Analysis of Variance Table
##
## Model 1: lifeExp ~ gdpPercap + continent
## Model 2: lifeExp ~ pop + gdpPercap + continent
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 135 5875.6
## 2 134 5872.9 1 2.655 0.0606 0.806
gapminder_lm_multiplicative = lm(lifeExp~gdpPercap*continent, data=oceania_out)
summary(gapminder_lm)
##
## Call:
## lm(formula = lifeExp ~ pop + gdpPercap + continent, data = oceania_out)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.8100 -3.0434 0.1563 2.9280 20.0551
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.371e+01 9.389e-01 57.205 < 2e-16 ***
## pop 9.697e-10 3.940e-09 0.246 0.806
## gdpPercap 3.487e-04 5.747e-05 6.068 1.25e-08 ***
## continentAmericas 1.603e+01 1.677e+00 9.553 < 2e-16 ***
## continentAsia 1.256e+01 1.627e+00 7.717 2.44e-12 ***
## continentEurope 1.518e+01 1.974e+00 7.689 2.84e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.62 on 134 degrees of freedom
## Multiple R-squared: 0.7088, Adjusted R-squared: 0.698
## F-statistic: 65.24 on 5 and 134 DF, p-value: < 2.2e-16
anova(gapminder_lm_multiplicative, gapminder_reduced_lm)
## Analysis of Variance Table
##
## Model 1: lifeExp ~ gdpPercap * continent
## Model 2: lifeExp ~ gdpPercap + continent
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 132 5474.8
## 2 135 5875.6 -3 -400.83 3.2214 0.02485 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Seems like adding the multiplication of gdpPercap and continent improves the reduced model slightly.
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp, color = continent)) +
geom_point(alpha = 0.6) + # Adds scatter points with slight transparency
labs(
title = "Life Expectancy vs GDP per Capita",
x = "GDP per Capita (log scale)",
y = "Life Expectancy",
color = "Continent"
) +
theme_minimal()
we see that as life expectancy goes higher, in continents where GDP is
higher accordingly.
ggplot(gapminder, aes(x = log(gdpPercap), y = lifeExp, color = continent)) +
geom_point(alpha = 0.6) + # Adds scatter points with slight transparency
labs(
title = "Life Expectancy vs GDP per Capita",
x = "GDP per Capita (log scale)",
y = "Life Expectancy",
color = "Continent"
) +
theme_minimal()
Yes, we still see that there is a growth in life expectancy in the
continents with higher GDP